* Reset settings and initialize log file
launch, path("share/job_decomp")

*-------------------------------------------------------------------------------
* Price and Wasserman (2024), "The Summer Drop in Female Employment"
*
* Description: Decompose gender gap in labor market seasonality into between-
*              and within-job components.
*-------------------------------------------------------------------------------


* List of decomposition components
local components overall sorting_sector sorting_edujobs sorting_othjobs within_edujobs within_othjobs baseline


* Prepare data and run models
*-------------------------------------------------------------------------------

if "$estimate" != "0" {
	foreach v in "emp" "atw" {

		* Prepare the data
		*-----------------

		* Load data on adult individuals
		gzuse "$basepath/data/derived/cps_bms_sample.dta.gz", clear

		* Retain variables we need
		keep pid hid tm year month wtfinl wtraked tmspline* weeks female emp empstat school ind1990 occ1990

		* Specify outcome variable
		if "`v'" == "emp" {
			drop empstat
		}
		else if "`v'" == "atw" {
			gen byte atw = (empstat == 10)
			drop emp empstat
		}

		* De-populate industry and occupation for non-working individuals
		replace school = . if `v' == 0
		replace ind1990 = . if `v' == 0
		replace occ1990 = . if `v' == 0

		* Assign occupation-industry pairs to job categories
		classify_jobs

		* Store a list of job types
		quietly levelsof job, local(jobs)
		local J : word count `jobs'

		* Identify the lowest job category coded as education
		quietly sum job if school == 1 & !missing(job)
		local E = r(min)

		* Record lagged status
		gen l_`v' = L.`v'
		gen l_job = L.job

		* Restrict to linked observations
		keep if !missing(wtraked)
		assert !missing(`v', l_`v')

		* Measure work (e), inflows (f), and outflows (s)
		gen byte e_j0 = `v'
		gen byte f_j0 = (l_`v' == 0 & `v' == 1)
		gen byte s_j0 = (l_`v' == 1 & `v' == 0)
		drop `v' l_`v'

		* Distinguish work/flows by job type
		foreach j in `jobs' {
			gen byte e_j`j' = e_j0 * (job == `j')
			gen byte f_j`j' = f_j0 * (job == `j')
			gen byte s_j`j' = s_j0 * (l_job == `j')
		}

		* Store the data
		tempfile core
		save `core'

		* Compute the average share of each group working in each job type at baseline
		use `core', clear
		keep if month == 5
		gcollapse (mean) e_j* [pw = wtraked], by(female)
		reshape long e_j, i(female) j(job)
		rename female g
		rename job j
		rename e_j e
		label variable g "Gender (0 = men, 1 = women)"
		label variable j "Job type (0 = aggregate)"
		label variable e "Share working in base month"
		compress
		save "$basepath/models/job_decomp/`v'_eshares.dta", replace

		* Aggregate flows across individuals
		use `core', clear
		gcollapse (mean) f_j* s_j* (first) month tmspline* weeks (rawsum) wtfinl [pw = wtraked], by(female tm)
		tsset female tm

		* Switch from gross flows to net flows
		foreach j in 0 `jobs' {
			gen n_j`j' = f_j`j' - s_j`j'
			drop f_j`j' s_j`j'
		}

		* Verify additivity
		quietly gen check = 0
		foreach j in `jobs' {
			quietly replace check = check + n_j`j'
		}

		assert abs(n_j0 - check) < .001
		drop check


		* Estimate the seasonal cycle in net flows separately by sex
		*-----------------------------------------------------------

		foreach g of numlist 0 1 {
			* Estimate specification
			ivreg2 n_j0 ib5.month D.tmspline* D.weeks if female == `g' [aw = wtfinl], bw($bandwidth) cluster(tm) small
			process_estimates, path("job_decomp") model("`v'_overall_f`g'")

			* Average the flow coefficients
			local beta_bar
			forvalues m = 1/12 {
				local beta_bar `"`beta_bar'_b[coef`m']"'
				if `m' < 12 local beta_bar `"`beta_bar' + "'
			}

			* Compute excess flows
			local coeflist
			forvalues m = 0/12 {
				local coeflist "`coeflist' (excess`m' = _b[coef`m'] - 1/12 * (`beta_bar'))"
			}

			* Transform estimates
			quietly xlincom `coeflist', post
			process_estimates, path("job_decomp") model("`v'_overall_f`g'_excess") raw
		}

		* Discard the aggregate category
		drop n_j0

		* Stack the data across sex and outcomes
		expand = `J'
		bysort female tm: gen byte stackid = _n
		replace stackid = stackid + `J' if female == 1

		* Declare "panel" data
		xtset stackid tm

		* Populate outcome and regressor variables
		gen flows = .
		forvalues k = 1/`=2 * `J'' {
			* Back out the job type
			if `k' <= `J' local j = `k'
			if `k' > `J' local j = `k' - `J'

			* Populate a stacked outcome variable
			replace flows = n_j`j' if stackid == `k'

			* Populate month (setting to the base month for off-diagonal stacks)
			quietly gen byte month`k' = .
			quietly replace month`k' = month if stackid == `k'
			quietly replace month`k' = 5 if stackid != `k'

			* Populate control variables (setting to zero for off-diagonal stacks)
			foreach x of varlist tmspline? weeks {
				quietly gen `x'_stack`k' = `x' * (stackid == `k')
			}
		}

		* Discard deprecated variables
		drop n_j* month tmspline? weeks

		* Estimate the stacked specification
		ivreg2 flows i.stackid ib5.month* D.tmspline* D.weeks* [aw = wtfinl], bw($bandwidth) cluster(tm) small
		process_estimates, path("job_decomp") model("`v'_stacked_raw") raw


		* Transform estimates into excess flows
		*--------------------------------------

		* Construct a list of new parameters
		local coeflist
		foreach g of numlist 0 1 {
			foreach j of numlist 1/`J' {
				* Compute stack ID
				local k = `j' + (`J' * `g')

				* Average the flow coefficients
				local beta_bar
				forvalues m = 1/12 {
					local beta_bar `"`beta_bar'_b[`m'.month`k']"'
					if `m' < 12 local beta_bar `"`beta_bar' + "'
				}

				* Compute excess flows
				forvalues m = 0/12 {
					* Normalize months so that May = 0
					local mm = mod(`m' + 5, 12) + 12 * (mod(`m' + 5, 12) == 0)

					* Add to the list of transformed parameters
					local coeflist "`coeflist' (g`g'_j`j'_m`m' = _b[`mm'.month`k'] - 1/12 * (`beta_bar'))"
				}
			}
		}

		* Transform estimates
		quietly xlincom `coeflist', post
		process_estimates, path("job_decomp") model("`v'_stacked_excess") raw


		* Prepare constants used in the decomposition
		*--------------------------------------------

		* Baseline share of each group's population working in each job
		use "$basepath/models/job_decomp/`v'_eshares.dta", clear
		foreach g of numlist 0 1 {
			foreach j of numlist 0/`J' {
				quietly sum e if g == `g' & j == `j'

				* Overall EPOP
				if `j' == 0 {
					local e_g`g' = r(mean)
				}

				* Job-specific EPOP
				else {
					local e_g`g'_j`j' = r(mean)
				}
			}
		}

		* Baseline EPOP, averaged across genders
		local e_avg = (`e_g0' + `e_g1')/2

		* Gender gap in baseline EPOP
		local e_gap = `e_g1' - `e_g0'

		* Initialize sectoral shares of each group's work
		local phi_g0_edu = 0
		local phi_g0_oth = 0
		local phi_g1_edu = 0
		local phi_g1_oth = 0

		* Loop over job types
		foreach j of numlist 1/`J' {
			* Distinguish education vs. non-education jobs
			if `j' < `E' local S "oth"
			if `j' >= `E' local S "edu"

			* Loop over genders
			foreach g of numlist 0 1 {
				* Compute each job's share of each group's work
				local phi_g`g'_j`j' = `e_g`g'_j`j''/`e_g`g''

				* Tally these work shares within each sector
				local phi_g`g'_`S' = `phi_g`g'_`S'' + `phi_g`g'_j`j''
			}

			* Job share of population, averaged across genders
			local e_j`j' = (`e_g0_j`j'' + `e_g1_j`j'')/2

			* Job share of work, averaged across genders
			local phi_j`j' = (`phi_g0_j`j'' + `phi_g1_j`j'')/2

			* Gender differences in share of work in each job/sector
			local phi_gap_j`j' = `phi_g1_j`j'' - `phi_g0_j`j''
		}

		* Gender differences in share of work in each job/sector
		local phi_gap_edu = `phi_g1_edu' - `phi_g0_edu'
		local phi_gap_oth = `phi_g1_oth' - `phi_g0_oth'


		* Construct decomposition expressions
		*------------------------------------

		local overall_list
		local within_edujobs_list
		local within_othjobs_list
		local sorting_sector_list
		local sorting_edujobs_list
		local sorting_othjobs_list
		local baseline_list

		* Assemble a list of (M - 1) x J decomposition components
		forvalues m = 1/11 {
			* Compute measures of excess net flows in education vs. non-education
			local L_edu "0"
			local L_oth "0"

			foreach j of numlist 1/`J' {
				* Distinguish education vs. non-education jobs
				if `j' < `E' local S "oth"
				if `j' >= `E' local S "edu"

				* Gender gap in excess net flows
				local l_g0_j`j' "_b[g0_j`j'_m`m']/`e_g0_j`j''"
				local l_g1_j`j' "_b[g1_j`j'_m`m']/`e_g1_j`j''"
				local l_j`j' `"(`l_g0_j`j'' + `l_g1_j`j'')/2"'

				* Job share of sectoral work, averaged across genders
				local phishare_avg_j`j' = (`phi_g0_j`j''/`phi_g0_`S'' + `phi_g1_j`j''/`phi_g1_`S'')/2

				* Scaling term
				local scaling = `phishare_avg_j`j'' * `e_avg'

				* Rescaled measure of excess net flows in each job
				local L_`S' `"`L_`S'' + `scaling' * `l_j`j''"'
			}

			* Cumulate decomposition components across job types
			foreach j of numlist 1/`J' {
				* Distinguish education vs. non-education jobs
				if `j' < `E' local S "oth"
				if `j' >= `E' local S "edu"


				* Define intermediate parameters
				*-------------------------------

				* Normalize net excess flows by baseline work in each job type
				local l_g0_j`j' "_b[g0_j`j'_m`m']/`e_g0_j`j''"
				local l_g1_j`j' "_b[g1_j`j'_m`m']/`e_g1_j`j''"

				* Gender gap in excess net flows
				local l_gap_j`j' `"(`l_g1_j`j'' - `l_g0_j`j'')"'

				* Excess job flows in this job, averaged across genders
				local l_j`j' `"(`l_g0_j`j'' + `l_g1_j`j'')/2"'

				* Rescaled measure of excess net flows in each job
				local L_j `"(`e_g0' + `e_g1')/2 * `l_j`j''"'


				* Define decomposition terms
				*---------------------------

				* Parameter suffix
				local sfx "m`m'_j`j'"

				* Overall change in gender gap in EPOP
				local overall_list `"`overall_list' (overall_`sfx' = _b[g1_j`j'_m`m'] - _b[g0_j`j'_m`m'])"'

				* Within-job effects
				local within_`S'jobs_list `"`within_`S'jobs_list' (within_`S'jobs_`sfx' = `e_j`j'' * `l_gap_j`j'')"'

				* Sorting across education vs. non-education sector (doesn't involve granular job types)
				if `j' == 1 {
					local sorting_sector_list `"`sorting_sector_list' (sorting_sector_`sfx' = `phi_gap_edu' * (`L_edu') + `phi_gap_oth' * (`L_oth'))"'
				}

				* Sorting across jobs within a given sector
				local sorting_`S'jobs_list `"`sorting_`S'jobs_list' (sorting_`S'jobs_`sfx' = `phi_gap_j`j'' * (`L_j' - (`L_`S'')))"'

				* Baseline EPOP effect
				local baseline_list `"`baseline_list' (baseline_`sfx' = `phi_j`j'' * `e_gap' * `l_j`j'')"'
			}
		}


		* Estimate decomposition
		*-----------------------

		* Loop over the components of the decomposition
		foreach c of local components {
			* Progress report
			display "`c' ($S_DATE $S_TIME)"

			* Reload estimates
			estimates use "$basepath/models/job_decomp/`v'_stacked_excess.ster"

			* Estimate (M - 1) x J decomposition parameters
			quietly xlincom ``c'_list', post

			* Cumulate parameters over months and across job types
			local coeflist
			forvalues m = 1/11 {
				local coef "0"
				forvalues n = 1/`m' {
					forvalues j = 1/`J' {
						* Skip irrelevant components
						if "`c'" == "sorting_sector" & `j' != 1 continue
						if "`c'" == "sorting_edujobs" & `j' < `E' continue
						if "`c'" == "sorting_othjobs" & `j' >= `E' continue
						if "`c'" == "within_edujobs" & `j' < `E' continue
						if "`c'" == "within_othjobs" & `j' >= `E' continue

						* Augment list of coefficients to be summed
						local coef `"`coef' + _b[`c'_m`n'_j`j']"'
					}
				}

				local coeflist `"`coeflist' (`c'_m`m' = `coef')"'
			}

			quietly xlincom `coeflist', post

			* Save the finalized estimation object
			estimates save "$basepath/models/job_decomp/`v'_decomp_`c'.ster", replace
		}
	}
}


* Plot and tabulate the decomposition results
*-------------------------------------------------------------------------------

foreach v in "emp" "atw" {

	* Create a dataset of decomposition components
	*---------------------------------------------

	* Turn each component into a dataset
	foreach c of local components {
		* Load estimates
		estimates use "$basepath/models/job_decomp/`v'_decomp_`c'.ster"

		* Extract components and their variances
		matrix B = e(b)
		matrix V = vecdiag(e(V))
		matrix BV = B \ V
		clear
		quietly svmat BV, names(col)
		gen k = _n
		reshape long `c', i(k) j(month_str) string
		gen byte month = real(substr(month_str, 3, .))

		* Compute standard error of components
		assert `c' >= 0 if k == 2
		replace `c' = sqrt(`c') if k == 2

		* Express estimates and standard errors in percentage terms
		replace `c' = 100 * `c'

		* Place estimates and standard errors side by side
		keep month k `c'
		reshape wide `c', i(month) j(k)
		rename `c'1 `c'_beta
		rename `c'2 `c'_stde

		* Add rows for the base month
		set obs `=_N + 2'
		replace month = 0 in -2/-2
		replace month = 12 in -1/-1
		replace `c'_beta = 0 if inlist(month, 0, 12)
		replace `c'_stde = 0 if inlist(month, 0, 12)
		sort month

		* Store for stacking
		tempfile `c'_dta
		save ``c'_dta'
	}

	* Combine components
	foreach c of local components {
		if "`c'" == "overall" {
			use ``c'_dta', clear
		}
		else {
			quietly merge 1:1 month using ``c'_dta', assert(3) nogenerate
		}
	}

	* Verify that subcomponents sum to total
	gen overall_check = 0
	replace overall_check = overall_check + sorting_sector_beta
	replace overall_check = overall_check + sorting_edujobs_beta
	replace overall_check = overall_check + sorting_othjobs_beta
	replace overall_check = overall_check + within_edujobs_beta
	replace overall_check = overall_check + within_othjobs_beta
	replace overall_check = overall_check + baseline_beta
	assert abs(overall_beta - overall_check) < .00001
	drop overall_check


	* Plot the decomposition results
	*-------------------------------

	* Bar positioning
	gen curr_pos = 0
	gen curr_neg = 0

	* Loop over components in the order we'll cumulate them
	foreach c in "sorting_sector" "sorting_edujobs" "sorting_othjobs" "within_edujobs" "within_othjobs" "baseline" {
		gen `c'0 = .
		replace `c'0 = curr_pos if sign(`c'_beta) >= 0
		replace `c'0 = curr_neg if sign(`c'_beta) < 0

		gen `c'1 = `c'0 + `c'_beta
		replace curr_pos = curr_pos + `c'_beta if `c'_beta > 0
		replace curr_neg = curr_neg + `c'_beta if `c'_beta < 0
	}

	drop curr_*

	* Prepare shading
	if "`v'" == "emp" {
		local ymin -1.5
		local ymax +1.5
		local ylbl "-1.5(0.5)1.5"
	}
	else if "`v'" == "atw" {
		local ymin -5
		local ymax +2
		local ylbl "-5(1)2"
	}

	gen rlow = `ymin'
	gen rupp = `ymax'
	gen rval = month - 0.5

	* Choose colors
	if "`v'" == "emp" {
		local col1 "color(black)"
		local col2 "color(gs4)"
		local col3 "color(gs7)"
		local col4 "color(gs10)"
		local col5 "color(gs13)"
		local col6 "color(gs16)"
	}
	else if "`v'" == "atw" {
		local col1 "color($col1) fcolor(*1.30)"
		local col2 "color($col2) fcolor(*1.10)"
		local col3 "color($col3) fcolor(*0.90)"
		local col4 "color($col4) fcolor(*0.60)"
		local col5 "color($col5) fcolor(*0.35)"
		local col6 "color($col6) fcolor(*0.35)"
	}

	#delimit ;
	twoway
		(rarea rupp rlow rval if inrange(month, 0, 5), color($ltgs) plotregion(margin(t=0 b=0 l=0)))
		(scatteri 0 -0.5 0 12.5, recast(line) color(black))
		(rbar sorting_sector0 sorting_sector1 month, `col1' lcolor(black) lwidth(vthin))
		(rbar sorting_edujobs0 sorting_edujobs1 month, `col2' lcolor(black) lwidth(vthin))
		(rbar sorting_othjobs0 sorting_othjobs1 month, `col3' lcolor(black) lwidth(vthin))
		(rbar within_edujobs0 within_edujobs1 month, `col4' lcolor(black) lwidth(vthin))
		(rbar within_othjobs0 within_othjobs1 month, `col5' lcolor(black) lwidth(vthin))
		(rbar baseline0 baseline1 month, `col6' lcolor(black) lwidth(vthin))
		(connected overall_beta month, color(black) msymbol(circle) clwidth(medthick)),
		title("")
		xtitle("")
		xscale(range(0.5 12.5))
		xlabel(0 "May" 1 "June" 2 "July" 3 "Aug." 4 "Sept." 5 "Oct." 6 "Nov." 7 "Dec." 8 "Jan." 9 "Feb." 10 "Mar." 11 "Apr." 12 "May", alternate)
		ytitle("{&Delta} in gender gap since May (p.p.)")
		yscale(range(`ymin' `ymax'))
		ylabel(`ylbl')
		yline(0)
		legend(rows(4) symxsize(*0.6) colfirst order(9 3 4 5 - 6 7 8)
			label(3 "1. Sorting into the education sector")
			label(4 "2. Sorting across occs inside ed.")
			label(5 "3. Sorting across occs outside ed.")
			label(6 "4. Within occs inside ed.")
			label(7 "5. Within occs outside ed.")
			label(8 "6. Baseline scaling term")
			label(9 "Overall gender gap"));
	#delimit cr

	nicepdf "$basepath/output/job_decomp_`v'.pdf", indirect replace


	* Create a table showing point estimates and standard errors
	*-----------------------------------------------------------

	* Organize the estimates
	keep month *_beta *_stde
	rename *_beta beta*
	rename *_stde stde*
	reshape long beta stde, i(month) j(component) string
	rename beta vbeta
	rename stde vstde
	reshape long v, i(month component) j(parameter) string
	reshape wide v, i(component parameter) j(month)

	* List the components in the desired order
	gen k = .
	replace k = 0 if component == "overall"
	replace k = 1 if component == "sorting_sector"
	replace k = 2 if component == "sorting_edujobs"
	replace k = 3 if component == "sorting_othjobs"
	replace k = 4 if component == "within_edujobs"
	replace k = 5 if component == "within_othjobs"
	replace k = 6 if component == "baseline"
	sort k parameter

	* Format the estimates
	forvalues m = 0/12 {
		gen v`m'_str = string(v`m', "%9.3f")
		drop v`m'
		rename v`m'_str v`m'
		replace v`m' = "(" + v`m' + ")" if parameter == "stde"
	}

	* Write the header
	capture file close decomp
	file open decomp using "$basepath/output/job_decomp_`v'.tex", write replace
	file write decomp "\definecolor{grayshading}{gray}{0.9}" _n
	file write decomp "\newcolumntype{g}{>{\columncolor{grayshading}}c}" _n
	file write decomp "\begin{tabularx}{\textwidth}{p{2in}gggggcccccccc}" _n
	file write decomp "\toprule" _n
	file write decomp "\textbf{Component} & \textbf{May} & \textbf{June} & \textbf{July} & \textbf{Aug.} & \textbf{Sept.} & \textbf{Oct.} & \textbf{Nov.} & \textbf{Dec.} & \textbf{Jan.} & \textbf{Feb.} & \textbf{Mar.} & \textbf{Apr.} & \textbf{May} \\" _n
	file write decomp "\midrule" _n

	* Prepare the numerical cells
	forvalues r = 1/14 {
		local row`r'
		forvalues m = 0/12 {
			if `m' == 2 {
				local row`r' `"`row`r'' & `=v`m'[`r']'"'
			}
			else {
				local row`r' `"`row`r'' & `=v`m'[`r']'"'
			}
		}
	}

	* Write the rows
	file write decomp "Overall change in gender gap           `row1' \\" _n
	file write decomp "                                       `row2' \\[1em]" _n
	file write decomp "Sorting into the education sector      `row3' \\" _n
	file write decomp "                                       `row4' \\[1em]" _n
	file write decomp "Sorting across occupations inside ed.  `row5' \\" _n
	file write decomp "                                       `row6' \\[1em]" _n
	file write decomp "Sorting across occupations outside ed. `row7' \\" _n
	file write decomp "                                       `row8' \\[1em]" _n
	file write decomp "Within occupations inside ed.          `row9' \\" _n
	file write decomp "                                       `row10' \\[1em]" _n
	file write decomp "Within occupations outside ed          `row11' \\" _n
	file write decomp "                                       `row12' \\[1em]" _n
	file write decomp "Baseline scaling component             `row13' \\" _n
	file write decomp "                                       `row14' \\" _n

	* Close out the table
	file write decomp "\bottomrule" _n
	file write decomp "\end{tabularx}"
	file close decomp


	* Report the share of the gender gap explained by each component
	*---------------------------------------------------------------

	* Retain point estimates for July
	keep if parameter == "beta"
	keep component v2

	* Report each component's share of the total
	gen share = 100 * real(v2)/real(v2[1])
	format %9.1f share
	list
}

* Close the log file
unlaunch
